library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(broom)
library(ggridges)
# Cells #
vpa.cell.neg.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_negmode.csv")
vpa.cell.pos.raw <- read_csv("./data/abundances/vpa_1and2_cells_target_posmode.csv")
# Media #
vpa.med.neg.raw <- read_csv("./data/abundances/vpa_1and2_media_target_negmode.csv")
vpa.med.pos.raw <- read_csv("./data/abundances/vpa_1and2_media_target_posmode.csv")
# Cells #
vpa.cell.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_negmode_cmpnd_info.csv")
vpa.cell.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_cells_target_posmode_cmpnd_info.csv")
# Media #
vpa.med.neg.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_negmode_cmpnd_info.csv")
vpa.med.pos.compound.info <- read_csv("./data/compound_info/vpa_1and2_media_target_posmode_cmpnd_info.csv")
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")
MissingPerSamplePlot <- function(raw.data, start.string) {
# Counts the number of missing/NA values per sample and
# percent compounds missing out of total number of compounds per sample
# Then passes the result into a vertical bar plot, where each
# bar represents a single sample and the size of the bar
# is the % of compounds missing
counted.na <- raw.data %>%
select(starts_with(start.string)) %>%
mutate(
count.na = apply(., 1, function(x) sum(is.na(x))),
percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
) %>%
dplyr::select(count.na, percent.na) %>%
bind_cols(
raw.data %>%
select(Samples, Group)
) %>%
arrange(percent.na) %>%
mutate(f.order = factor(Samples, levels = Samples))
counted.na %>%
ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
coord_flip()+
xlab("Samples") +
ylab("Percent missing values in sample") +
theme(axis.text.y = element_text(size = 6))
}
MissingPerCompound <- function(raw.data, start.string){
# Function to count in how many experimental samples each compound is missing
# Also calculates the % missing:
# (# NA per compound across all experimental samples) * 100 / (tot num of samples)
raw.data %>%
filter(Group == "sample") %>%
select(Samples, starts_with(start.string)) %>%
gather(key = "Compound", value = "Abundance", -Samples) %>%
group_by(Compound) %>%
summarise(
na_count = sum(is.na(Abundance)),
n_samples = n(),
percent_na = (na_count * 100 / n_samples)
) %>%
filter(na_count > 0) %>%
arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
# Function to replace any NAs in each column with the minimum for that column,
# separately for each sample type.
# NA in negative control samples are replaced by 2.
# Then data is log2 transformed
smpls <- raw.dataframe %>%
filter(Group == "sample") %>%
dplyr::select(starts_with(start.prefix))
smpls.min <- lapply(smpls, min, na.rm = TRUE)
# replace the missing values in the real samples with the minimum of the samples
# then take the log
smpls.noNA <- raw.dataframe %>%
filter(Group == "sample") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
smpls %>%
replace_na(replace = smpls.min) %>%
log2()
)
# QC
QC <- raw.dataframe %>%
filter(Group == "QC") %>%
dplyr::select(starts_with(start.prefix))
QC.min <- lapply(QC, min, na.rm = TRUE)
# replace the missing values in the QC with the minimum of the QC
# then take the log
QC.noNA <- raw.dataframe %>%
filter(Group == "QC") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
QC %>%
replace_na(replace = QC.min) %>%
log2()
)
# replace the missing values in solv and empty samples with 2 - for PCA analysis
# then take the log
other.min <- setNames(
as.list(
rep(2, ncol(
raw.dataframe %>%
dplyr::select(starts_with(start.prefix))))
),
colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
)
other.num.log <- raw.dataframe %>%
filter(Group != "sample" & Group != "QC") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
raw.dataframe %>%
filter(Group != "sample" & Group != "QC") %>%
dplyr::select(starts_with(start.prefix)) %>%
replace_na(replace = other.min) %>%
log2()
)
# combine them together back into one data frame
all.noNA <- smpls.noNA %>%
bind_rows(QC.noNA) %>%
bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
# function for preparing dara for heatmap viz
x <- raw.data %>%
select(starts_with(start.prefix)) %>%
scale(center = TRUE, scale = TRUE) %>%
as.matrix()
row.names(x) <- raw.data$Samples
return(x)
}
Q: What are the distributions of compound masses and retention times?
# bind all 4 compound info df into 1
full.vpa.cmpnd <- vpa.cell.neg.compound.info %>%
mutate(Set = "Cells / Neg") %>%
bind_rows(vpa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>%
bind_rows(vpa.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
bind_rows(vpa.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.vpa.cmpnd %>%
ggplot(aes(x = rt, y = mass)) +
geom_point(size = 3, alpha = 0.3) +
xlab("Retention Time (min)") +
ylab("Mass (Da)") +
ggtitle("Mass v RT\nVPA Only Exp") +
ylim(0, 1100)
full.vpa.cmpnd %>%
ggplot(aes(x = rt, y = mass, color = Set)) +
geom_point(size = 3, alpha = 0.5) +
xlab("Retention Time (min)") +
ylab("Mass (Da)") +
ggtitle("Mass v RT\nVPA Only Exp") +
facet_wrap(~ Set) +
ylim(0, 1100)
The compounds included in the Cells / Neg mode and Cells / Pos mode datasets loook to have a good spread of masses and retention times within the expected RT window of 0 to 29min and mass window of 50 to 1000Da. A smaller number of compounds were detected in the media samples, and the larger molecules that were detected in cells were not detected in media. This is not a surprising result, media in general is expected to have a lower number of compounds than the cells themselves, as cells produce a wide variety of molecules that are not exported to the media.
Q: Which compounds were found in one or more of the data types?
### vpa cell join ###
vpa.cell.cmpnd.join <- vpa.cell.neg.compound.info %>%
inner_join(vpa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
# compound names - found in pos and neg mode / cells
print(vpa.cell.cmpnd.join$compound_full.c.n)
[1] "Glycine"
[2] "Acetoin"
[3] "Alanine"
[4] "Sarcosine"
[5] "Beta-Alanine"
[6] "2-Aminobutyric Acid"
[7] "BAIBA"
[8] "Serine"
[9] "Hypotaurine"
[10] "Uracil"
[11] "Creatinine"
[12] "5,6-Dihydrouracil"
[13] "Proline"
[14] "Valine"
[15] "Threonine"
[16] "Taurine"
[17] "4-Oxoproline"
[18] "Ketoleucine"
[19] "trans-4-Hydroxyproline"
[20] "Creatine"
[21] "Isoleucine"
[22] "Leucine"
[23] "Asparagine"
[24] "5-Hydroxyhexanoic Acid"
[25] "Aspartic Acid"
[26] "Adenine"
[27] "Hypoxanthine"
[28] "O-Phosphoethanolamine"
[29] "Glutamine"
[30] "Lysine"
[31] "N-Acetylserine"
[32] "Glutamic Acid"
[33] "Methionine"
[34] "D-Ribose"
[35] "Guanine"
[36] "Xanthine"
[37] "3-Sulfinoalanine"
[38] "Histidine"
[39] "Orotic Acid"
[40] "Allantoin"
[41] "Fucose"
[42] "Phenylalanine"
[43] "Pyridoxal"
[44] "Cysteic Acid"
[45] "Pyridoxine"
[46] "3-Methylhistidine"
[47] "G3P"
[48] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
[49] "N-Carbamoyl-L-aspartic Acid"
[50] "Tyrosine"
[51] "D-Galactitol"
[52] "Phosphocholine"
[53] "N-alpha-Acetylglutamine"
[54] "Tryptophan"
[55] "Phosphocreatine"
[56] "Glycerylphosphorylethanolamine"
[57] "O-Succinylhomoserine"
[58] "Pantothenic Acid"
[59] "GlcNAc"
[60] "Cystathionine"
[61] "Deoxycytidine"
[62] "Cytidine"
[63] "Uridine"
[64] "Palmitoleic Acid"
[65] "Glycerol 1-phosphoserine"
[66] "D-Glucose 6-phosphate"
[67] "Thiamine"
[68] "Inosine"
[69] "Myoinositol-methyl-phosphate"
[70] "Linoleic Acid"
[71] "Guanosine"
[72] "Xanthosine"
[73] "L-Arginosuccinic Acid"
[74] "Ricinoleic Acid"
[75] "Glutathione"
[76] "11Z,14Z-Eicosadienoic Acid (20:2 n-6)"
[77] "CMP"
[78] "UMP"
[79] "3-Phosphoglyceroinositol"
[80] "AMP"
[81] "GMP"
[82] "SAH"
[83] "CDP"
[84] "ADP"
[85] "GDP"
[86] "LysoPE(18:2)"
[87] "LysoPE(18:0)"
[88] "dTTP"
[89] "CTP"
[90] "UTP"
[91] "CDP-Choline"
[92] "dATP"
[93] "ATP"
[94] "GTP"
[95] "ADP-Ribose"
[96] "UDP-Galactose"
[97] "UDP-Glucose"
[98] "UDP-Glucuronic acid"
[99] "ADP-Glucose"
[100] "GDP-Glucose"
[101] "UDP-GalNAc"
[102] "UDP-GlcNAc"
[103] "GSSG"
[104] "NAD"
[105] "NADH"
[106] "NADP"
[107] "PC(36:4)"
[108] "FAD"
[109] "Acetyl-CoA"
# percent of cell / neg compounds found in cell / pos
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.neg.compound.info), 1)
[1] 51.7
# percent of cell / neg compounds found in cell / pos
round(nrow(vpa.cell.cmpnd.join) * 100 / nrow(vpa.cell.pos.compound.info), 1)
[1] 61.9
### vpa media join ###
vpa.med.cmpnd.join <- vpa.med.neg.compound.info %>%
inner_join(vpa.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
# compound names - found in pos and neg mode / media
print(vpa.med.cmpnd.join$compound_full.m.n)
[1] "Alanine" "Serine"
[3] "Uracil" "Valine"
[5] "Threonine" "Taurine"
[7] "Thymine" "4-Oxoproline"
[9] "Isoleucine" "Leucine"
[11] "Glutamine" "Methionine"
[13] "Xanthine" "Histidine"
[15] "Allantoin" "Phenylalanine"
[17] "Uric Acid" "D-Glucose"
[19] "Tyrosine" "D-Galactitol"
[21] "Cysteine-S-sulfonic Acid" "Tryptophan"
[23] "Pantothenic Acid" "Cytidine"
[25] "Uridine" "Inosine"
[27] "Guanosine" "Folic Acid"
# percent of media / neg compounds found in media / pos
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.neg.compound.info), 1)
[1] 51.9
# percent of media / pos compounds found in media / neg
round(nrow(vpa.med.cmpnd.join) * 100 / nrow(vpa.med.pos.compound.info), 1)
[1] 50
### vpa all modes join ###
vpa.all.cmpnd.join <- vpa.cell.cmpnd.join %>%
inner_join(vpa.med.cmpnd.join, by = "cas_id") %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
nrow(vpa.all.cmpnd.join)
[1] 23
# check for any mass issues between all 4 modes
vpa.all.cmpnd.join %>%
select(contains("mass")) %>%
ggpairs()
# any rt inconsistencies?
vpa.all.cmpnd.join %>%
select(starts_with("rt")) %>%
ggpairs()
Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?
MissingPerSamplePlot(vpa.cell.neg.raw, "VPAnC") +
ggtitle("Missing Per Sample\nVPA-only / Cells / Neg Mode")
MissingPerSamplePlot(vpa.cell.pos.raw, "VPApC") +
ggtitle("Missing Per Sample\nVPA-only / Cells / Pos Mode")
MissingPerSamplePlot(vpa.med.neg.raw, "VPAnM") +
ggtitle("Missing Per Sample\nVPA-only / Media / Neg Mode")
MissingPerSamplePlot(vpa.med.pos.raw, "VPApM") +
ggtitle("Missing Per Sample\nVPA-only / Media / Pos Mode")
A: No, sample files across both datasets have very few missing values. The green-colored bars, marked “sample” are the actual experimental samples. Whereas the “solv”, or “solvent”, and “empty” samples are negative control samples that are expected to have many missing values and/or low compound abundances. They will be used to narrow down the list of features later on in the analysis.
Q: Are any of the compounds more than 20% missing in the experimental sample group? If there are any, they will be excluded from analysis.
Note: The MissingPerCompound function considers only the Group == "sample" samples.
MissingPerCompound(vpa.cell.neg.raw, "VPAnC") %>%
filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
# percent_na <dbl>
(vpa.cell.pos.cmpnd.to.excl<- MissingPerCompound(vpa.cell.pos.raw, "VPApC") %>%
filter(percent_na > 20))
# A tibble: 1 x 4
Compound na_count n_samples percent_na
<chr> <int> <int> <dbl>
1 VPApC110 9 23 39.1
MissingPerCompound(vpa.med.neg.raw, "VPAnM") %>%
filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
# percent_na <dbl>
(vpa.med.pos.cmpnd.to.excl <- MissingPerCompound(vpa.med.pos.raw, "VPApM") %>%
filter(percent_na > 20))
# A tibble: 1 x 4
Compound na_count n_samples percent_na
<chr> <int> <int> <dbl>
1 VPApM34 6 24 25
A: No compounds need to be excluded from the Cell / Neg Mode and Media / Neg Mode datasets, but 1 compound each will be excluded from the Cell / Pos Mode and Media / Pos Mode datasets.
# get the mean abundance of each compound, grouped by solvent vs empty sample vs experimental sample
vpa.cell.neg.raw.grp.mean <- vpa.cell.neg.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("VPAnC")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
# plot the log2 density distribution of the means
vpa.cell.neg.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nVPA-only / Cells / Negative Mode\nGrouped by sample type")
# for plotting purposes, to make the plot neater
vpa.cell.neg.raw.grp.mean.order <- vpa.cell.neg.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
vpa.cell.neg.raw %>%
select(Samples, Group, starts_with("VPAnC")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
# overlay the group averages
geom_line(
data = vpa.cell.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Negative Mode")
# compound mean by group only, ordered by increasing abundance in the experimental samples
vpa.cell.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Negative Mode")
vpa.cell.neg.raw.grp.diff <- vpa.cell.neg.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(
smpl_empty_diff = sample / empty,
smpl_solv_diff = sample / solv,
empty_solv_diff = empty / solv
)
vpa.cell.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff))) +
geom_histogram(bins = 50)
vpa.cell.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 50)
vpa.cell.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
geom_point(size = 2, alpha = 0.5) +
xlim(-3, 13.5) +
ylim(-3, 13.5) +
# abline in pink
geom_abline(intercept = 0, slope = 1, color = "#CC79A7", size = 2, alpha = 0.6) +
# lm line in green
geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
ggtitle("Background signal\nRaw Data / Cells / Neg Mode")
# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.neg.cmpnd.to.incl <- vpa.cell.neg.raw.grp.diff %>%
filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff))
# how many compound were there in the raw negative mode dataset?
nrow(vpa.cell.neg.raw.grp.diff)
[1] 211
# how many compounds are retained for further analysis?
nrow(vpa.cell.neg.cmpnd.to.incl)
[1] 179
vpa.cell.pos.raw.grp.mean <- vpa.cell.pos.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("VPApC")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.cell.pos.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nVPA-only / Cells / Positive Mode\nGrouped by sample type")
vpa.cell.pos.raw.grp.mean.order <- vpa.cell.pos.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
vpa.cell.pos.raw %>%
select(Samples, Group, starts_with("VPApC")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
# overlay the group averages
geom_line(
data = vpa.cell.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Cells / Positive Mode")
vpa.cell.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.cell.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nVPA-only / Cells / Positive Mode")
vpa.cell.pos.raw.grp.diff <- vpa.cell.pos.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(
smpl_empty_diff = sample / empty,
smpl_solv_diff = sample / solv,
empty_solv_diff = empty / solv
)
vpa.cell.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff))) +
geom_histogram(bins = 50)
vpa.cell.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 50)
vpa.cell.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
geom_point(size = 2, alpha = 0.5) +
xlim(-1, 15) +
ylim(-1, 15) +
geom_abline(intercept = 0, slope = 1, color = "#CC79A7", size = 2, alpha = 0.6) +
geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
ggtitle("Background signal\nRaw Data / Cells / Pos Mode")
# include compounds with FC > 2.5 or FC is NA (indication of NA in solv or empty)
vpa.cell.pos.cmpnd.to.incl <- vpa.cell.pos.raw.grp.diff %>%
filter((smpl_solv_diff > 2.5 & smpl_empty_diff > 2.5) | is.na(smpl_solv_diff) | is.na(smpl_empty_diff)) %>%
filter(!(Compound %in% vpa.cell.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.cell.pos.raw.grp.diff)
[1] 176
# how many compounds are retained for further analysis?
nrow(vpa.cell.pos.cmpnd.to.incl)
[1] 142
vpa.med.neg.raw.grp.mean <- vpa.med.neg.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("VPAnM")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.neg.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nVPA-only / Media / Negative Mode\nGrouped by sample type")
vpa.med.neg.raw.grp.mean.order <- vpa.med.neg.raw.grp.mean %>%
filter(Group == "empty") %>%
arrange(Grp_mean_abun)
vpa.med.neg.raw %>%
select(Samples, Group, starts_with("VPAnM")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1.5) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
# overlay the group averages
geom_line(
data = vpa.med.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Negative Mode")
vpa.med.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)")
vpa.med.neg.raw.grp.diff <- vpa.med.neg.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(
smpl_empty_diff = sample / empty,
smpl_solv_diff = sample / solv,
empty_solv_diff = empty / solv
)
vpa.med.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff))) +
geom_histogram(bins = 25)
vpa.med.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 25)
vpa.med.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
geom_point(size = 2, alpha = 0.5) +
xlim(-4, 4) +
ylim(-1, 4) +
# abline in pink
geom_abline(intercept = 0, slope = 1, color = "#CC79A7", size = 2, alpha = 0.6) +
# lm line in green
geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
ggtitle("Background signal\nRaw Data / Media / Neg Mode")
# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.neg.cmpnd.to.incl <- vpa.med.neg.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# how many compound were there in the raw dataset?
nrow(vpa.med.neg.raw.grp.diff)
[1] 54
# how many compounds are retained for further analysis?
nrow(vpa.med.neg.cmpnd.to.incl)
[1] 41
vpa.med.pos.raw.grp.mean <- vpa.med.pos.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("VPApM")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
vpa.med.pos.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nVPA-only / Media / Positive Mode\nGrouped by sample type")
vpa.med.pos.raw.grp.mean.order <- vpa.med.pos.raw.grp.mean %>%
filter(Group == "empty") %>%
arrange(Grp_mean_abun)
vpa.med.pos.raw %>%
select(Samples, Group, starts_with("VPApM")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1.5) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
# overlay the group averages
geom_line(
data = vpa.med.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nVPA-only / Media / Positive Mode")
vpa.med.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = vpa.med.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)")
vpa.med.pos.raw.grp.diff <- vpa.med.pos.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(
smpl_empty_diff = sample / empty,
smpl_solv_diff = sample / solv,
empty_solv_diff = empty / solv
)
vpa.med.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff))) +
geom_histogram(bins = 25)
vpa.med.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 25)
vpa.med.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_empty_diff), log2(smpl_solv_diff))) +
geom_point(size = 2, alpha = 0.5) +
xlim(-6, 2) +
ylim(-1, 7) +
# abline in pink
geom_abline(intercept = 0, slope = 1, color = "#CC79A7", size = 2, alpha = 0.6) +
# lm line in green
geom_smooth(method = "lm", color = "#009E73", size = 2, alpha = 0.6, se = FALSE) +
xlab("log2([Sample Compound Mean] / [Empty Compound Mean])") +
ylab("log2([Sample Compound Mean] / [Solvent Compound Mean])") +
ggtitle("Background signal\nRaw Data / Media / pos Mode")
# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
vpa.med.pos.cmpnd.to.incl <- vpa.med.pos.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>%
filter(!(Compound %in% vpa.med.pos.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(vpa.med.pos.raw.grp.diff)
[1] 56
# how many compounds are retained for further analysis?
nrow(vpa.med.pos.cmpnd.to.incl)
[1] 36
# replace missing with minimum for each Group in each dataset and log2() transform the data:
# exclude compound that have a >20% NA count across samples
vpa.cell.neg.noNA <- vpa.cell.neg.raw %>%
select(Samples:Experiment, one_of(vpa.cell.neg.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("VPAnC")
vpa.cell.pos.noNA <- vpa.cell.pos.raw %>%
select(Samples:Experiment, one_of(vpa.cell.pos.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("VPApC")
vpa.med.neg.noNA <- vpa.med.neg.raw %>%
select(Samples:Experiment, one_of(vpa.med.neg.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("VPAnM")
vpa.med.pos.noNA <- vpa.med.pos.raw %>%
select(Samples:Experiment, one_of(vpa.med.pos.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("VPApM")
vpa.cell.neg.noNA.gathered <- vpa.cell.neg.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(vpa.cell.neg.noNA) == "VPAnC1"):ncol(vpa.cell.neg.noNA)
)
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.neg.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")
# same data format, but as ridge plots
vpa.cell.neg.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Negative Mode")
# experimental samples only
vpa.cell.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")
# overlay the distributions for another look
vpa.cell.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Negative Mode")
vpa.cell.pos.noNA.gathered <- vpa.cell.pos.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(vpa.cell.pos.noNA) == "VPApC1"):ncol(vpa.cell.pos.noNA)
)
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.cell.pos.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")
# same data format, but as ridge plots
vpa.cell.pos.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Cells / Positive Mode")
# experimental samples only
vpa.cell.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")
# overlay the distributions for another look
vpa.cell.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Cells / Positive Mode")
vpa.med.neg.noNA.gathered <- vpa.med.neg.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(vpa.med.neg.noNA) == "VPAnM1"):ncol(vpa.med.neg.noNA)
)
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.neg.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")
# same data format, but as ridge plots
vpa.med.neg.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Negative Mode")
# experimental samples only
vpa.med.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")
# overlay the distributions for another look
vpa.med.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Negative Mode")
vpa.med.pos.noNA.gathered <- vpa.med.pos.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(vpa.med.pos.noNA) == "VPApM1"):ncol(vpa.med.pos.noNA)
)
# plot all abundances in a sample, grouped by sample as a boxplot
vpa.med.pos.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")
# same data format, but as ridge plots
vpa.med.pos.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nVPA-only / Media / Positive Mode")
# experimental samples only
vpa.med.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")
# overlay the distributions for another look
vpa.med.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nVPA-only / Media / Positive Mode")
Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.
### PCA on all Samples ###
vpa.cell.neg.full.pca <- vpa.cell.neg.noNA %>%
select(starts_with("VPAnC")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(vpa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Negative Mode",
type = "b"
)
vpa.cell.neg.full.pca.x <- as.data.frame(vpa.cell.neg.full.pca$x)
row.names(vpa.cell.neg.full.pca.x) <- vpa.cell.neg.noNA$Samples
vpa.cell.neg.full.pca.x <- vpa.cell.neg.full.pca.x %>%
bind_cols(vpa.cell.neg.noNA %>% select(Group:Experiment))
vpa.cell.neg.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (90.5% Var)") +
ylab("PC2 (3.1%)") +
ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Negative Mode")
### Samples and QC ###
vpa.cell.neg.smpl.QC.pca <- vpa.cell.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("VPAnC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Negative Mode",
type = "b"
)
vpa.cell.neg.smpl.QC.pca.x <- as.data.frame(vpa.cell.neg.smpl.QC.pca$x)
vpa.cell.neg.smpl.QC.pca.x <- vpa.cell.neg.smpl.QC.pca.x %>%
bind_cols(
vpa.cell.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.cell.neg.smpl.QC.pca.x) <- vpa.cell.neg.smpl.QC.pca.x$Samples
vpa.cell.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (35.1% Var)") +
ylab("PC2 (19.7%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")
vpa.cell.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (16.1% Var)") +
ylab("PC4 (6.4%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Negative Mode")
### Experimental Samples Only ###
vpa.cell.neg.smpl.pca <- vpa.cell.neg.noNA %>%
filter(Group == "sample") %>%
select(starts_with("VPAnC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.neg.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Negative Mode",
type = "b"
)
vpa.cell.neg.smpl.pca.x <- as.data.frame(vpa.cell.neg.smpl.pca$x)
vpa.cell.neg.smpl.pca.x <- vpa.cell.neg.smpl.pca.x %>%
bind_cols(
vpa.cell.neg.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.cell.neg.smpl.pca.x) <- vpa.cell.neg.smpl.pca.x$Samples
vpa.cell.neg.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (34.3% Var)") +
ylab("PC2 (23.8%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")
vpa.cell.neg.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (16.6% Var)") +
ylab("PC4 (7.1%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Negative Mode")
### PCA on all Samples ###
vpa.cell.pos.full.pca <- vpa.cell.pos.noNA %>%
select(starts_with("VPApC")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(vpa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Cells / Positive Mode",
type = "b"
)
vpa.cell.pos.full.pca.x <- as.data.frame(vpa.cell.pos.full.pca$x)
row.names(vpa.cell.pos.full.pca.x) <- vpa.cell.pos.noNA$Samples
vpa.cell.pos.full.pca.x <- vpa.cell.pos.full.pca.x %>%
bind_cols(vpa.cell.pos.noNA %>% select(Group:Experiment))
vpa.cell.pos.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (90.0% Var)") +
ylab("PC2 (4.2%)") +
ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Cells / Positive Mode")
### Samples and QC ###
vpa.cell.pos.smpl.QC.pca <- vpa.cell.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("VPApC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Cells / Positive Mode",
type = "b"
)
vpa.cell.pos.smpl.QC.pca.x <- as.data.frame(vpa.cell.pos.smpl.QC.pca$x)
vpa.cell.pos.smpl.QC.pca.x <- vpa.cell.pos.smpl.QC.pca.x %>%
bind_cols(
vpa.cell.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.cell.pos.smpl.QC.pca.x) <- vpa.cell.pos.smpl.QC.pca.x$Samples
vpa.cell.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (55.0% Var)") +
ylab("PC2 (15.0%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")
vpa.cell.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (11.4% Var)") +
ylab("PC4 (5.0%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Cells / Positive Mode")
### Experimental Samples Only ###
vpa.cell.pos.smpl.pca <- vpa.cell.pos.noNA %>%
filter(Group == "sample") %>%
select(starts_with("VPApC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.cell.pos.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Cells / Positive Mode",
type = "b"
)
vpa.cell.pos.smpl.pca.x <- as.data.frame(vpa.cell.pos.smpl.pca$x)
vpa.cell.pos.smpl.pca.x <- vpa.cell.pos.smpl.pca.x %>%
bind_cols(
vpa.cell.pos.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.cell.pos.smpl.pca.x) <- vpa.cell.pos.smpl.pca.x$Samples
vpa.cell.pos.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (57.5% Var)") +
ylab("PC2 (19.6%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")
vpa.cell.pos.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (7.2% Var)") +
ylab("PC4 (4.7%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Cells / Positive Mode")
### PCA on all Samples ###
vpa.med.neg.full.pca <- vpa.med.neg.noNA %>%
select(starts_with("VPAnM")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(vpa.med.neg.full.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Negative Mode",
type = "b"
)
vpa.med.neg.full.pca.x <- as.data.frame(vpa.med.neg.full.pca$x)
row.names(vpa.med.neg.full.pca.x) <- vpa.med.neg.noNA$Samples
vpa.med.neg.full.pca.x <- vpa.med.neg.full.pca.x %>%
bind_cols(vpa.med.neg.noNA %>% select(Group:Experiment))
vpa.med.neg.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (75.8% Var)") +
ylab("PC2 (8.4%)") +
ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Negative Mode")
### Samples and QC ###
vpa.med.neg.smpl.QC.pca <- vpa.med.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("VPAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Negative Mode",
type = "b"
)
vpa.med.neg.smpl.QC.pca.x <- as.data.frame(vpa.med.neg.smpl.QC.pca$x)
vpa.med.neg.smpl.QC.pca.x <- vpa.med.neg.smpl.QC.pca.x %>%
bind_cols(
vpa.med.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.med.neg.smpl.QC.pca.x) <- vpa.med.neg.smpl.QC.pca.x$Samples
vpa.med.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (56.8% Var)") +
ylab("PC2 (25.0%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")
vpa.med.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (9.0% Var)") +
ylab("PC4 (4.1%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Negative Mode")
### Experimental Samples Only ###
vpa.med.neg.smpl.pca <- vpa.med.neg.noNA %>%
filter(Group == "sample") %>%
select(starts_with("VPAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.neg.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Negative Mode",
type = "b"
)
vpa.med.neg.smpl.pca.x <- as.data.frame(vpa.med.neg.smpl.pca$x)
vpa.med.neg.smpl.pca.x <- vpa.med.neg.smpl.pca.x %>%
bind_cols(
vpa.med.neg.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.med.neg.smpl.pca.x) <- vpa.med.neg.smpl.pca.x$Samples
vpa.med.neg.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (58.2% Var)") +
ylab("PC2 (27.5%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")
vpa.med.neg.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (8.3% Var)") +
ylab("PC4 (2.4%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Negative Mode")
### PCA on all Samples ###
vpa.med.pos.full.pca <- vpa.med.pos.noNA %>%
select(starts_with("VPApM")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(vpa.med.pos.full.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nVPA-only / Media / Positive Mode",
type = "b"
)
vpa.med.pos.full.pca.x <- as.data.frame(vpa.med.pos.full.pca$x)
row.names(vpa.med.pos.full.pca.x) <- vpa.med.pos.noNA$Samples
vpa.med.pos.full.pca.x <- vpa.med.pos.full.pca.x %>%
bind_cols(vpa.med.pos.noNA %>% select(Group:Experiment))
vpa.med.pos.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group, shape = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (85.6% Var)") +
ylab("PC2 (6.3%)") +
ggtitle("Principal Component Analysis\nAll Samples\nVPA-only / Media / Positive Mode")
### Samples and QC ###
vpa.med.pos.smpl.QC.pca <- vpa.med.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("VPApM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nVPA-only / Media / Positive Mode",
type = "b"
)
vpa.med.pos.smpl.QC.pca.x <- as.data.frame(vpa.med.pos.smpl.QC.pca$x)
vpa.med.pos.smpl.QC.pca.x <- vpa.med.pos.smpl.QC.pca.x %>%
bind_cols(
vpa.med.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.med.pos.smpl.QC.pca.x) <- vpa.med.pos.smpl.QC.pca.x$Samples
vpa.med.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (72.1% Var)") +
ylab("PC2 (15.6%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")
vpa.med.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (6.0% Var)") +
ylab("PC4 (2.5%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nVPA-only / Media / Positive Mode")
### Experimental Samples Only ###
vpa.med.pos.smpl.pca <- vpa.med.pos.noNA %>%
filter(Group == "sample") %>%
select(starts_with("VPApM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(vpa.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(vpa.med.pos.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nVPA-only / Media / Positive Mode",
type = "b"
)
vpa.med.pos.smpl.pca.x <- as.data.frame(vpa.med.pos.smpl.pca$x)
vpa.med.pos.smpl.pca.x <- vpa.med.pos.smpl.pca.x %>%
bind_cols(
vpa.med.pos.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(vpa.med.pos.smpl.pca.x) <- vpa.med.pos.smpl.pca.x$Samples
vpa.med.pos.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (80.9% Var)") +
ylab("PC2 (11.2%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nVPA-only / Media / Positive Mode")
Is VPA metabolised by cells?
vpa.med.neg.noNA %>%
select(Samples:Experiment, VPAnM24) %>%
unite(Sample_Type, Group:Treatment, sep = "_") %>%
ggplot(aes(Sample_Type, VPAnM24)) +
geom_jitter(alpha = 0.8, width = 0.1)
vpa.med.neg.noNA %>%
select(Samples:Experiment, VPAnM24) %>%
unite(Sample_Type, Group:Treatment, sep = "_") %>%
ggplot(aes(Sample_Type, VPAnM24)) +
geom_boxplot()
vpa.med.neg.noNA %>%
select(Samples:Experiment, VPAnM24) %>%
filter(Treatment == "vpa") %>%
group_by(Group) %>%
summarize(
vpa.avg = mean(VPAnM24),
vpa.sd = sd(VPAnM24)
)
# A tibble: 2 x 3
Group vpa.avg vpa.sd
<chr> <dbl> <dbl>
1 empty 20.8 0.413
2 sample 20.8 0.506
It seems likely that VPA is not getting metabolised to a great extent, but cannot be sure.
# sample prep
vpa.cell.neg.smpl.data <- vpa.cell.neg.noNA %>%
filter(Group == "sample")
vpa.cell.neg.data.for.sva <- as.matrix(
vpa.cell.neg.smpl.data[, which(
colnames(vpa.cell.neg.smpl.data) == "VPAnC1"
):ncol(vpa.cell.neg.smpl.data)]
)
row.names(vpa.cell.neg.data.for.sva) <- vpa.cell.neg.smpl.data$Samples
vpa.cell.neg.data.for.sva <- t(vpa.cell.neg.data.for.sva)
# pheno prep
vpa.cell.neg.data.pheno <- as.data.frame(vpa.cell.neg.smpl.data[, 5:7])
row.names(vpa.cell.neg.data.pheno) <- vpa.cell.neg.smpl.data$Samples
# sva calculation
vpa.cell.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.neg.data.pheno)
vpa.cell.neg.mod0 <- model.matrix(~ 1, data = vpa.cell.neg.data.pheno)
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "be")
[1] 3
set.seed(2018)
num.sv(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.cell.neg.sv <- sva(vpa.cell.neg.data.for.sva, vpa.cell.neg.mod.vpa, vpa.cell.neg.mod0)
Number of significant surrogate variables is: 3
Iteration (out of 5 ):1 2 3 4 5
# extract the surrogate variables
vpa.cell.neg.surr.var <- as.data.frame(vpa.cell.neg.sv$sv)
colnames(vpa.cell.neg.surr.var) <- c("S1", "S2", "S3")
colnames(vpa.cell.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
# combine the full model matrix and the surrogate variables into one
vpa.cell.neg.d.sv <- cbind(vpa.cell.neg.mod.vpa, vpa.cell.neg.surr.var)
head(vpa.cell.neg.d.sv)
cntrl DRUGvsCNTRL S1 S2 S3
T1_P1_C1 1 0 -0.02572670 0.30588286 -0.24101509
T1_P1_C2 1 0 0.22693416 -0.04248658 -0.16668619
T1_P1_C3 1 0 0.24683594 0.01195802 -0.16215771
T1_P1_V1 1 1 0.26304093 0.05597521 0.10204063
T1_P1_V2 1 1 -0.08024336 0.19758531 0.01981105
T1_P1_V3 1 1 -0.08965598 0.26184963 -0.00716443
vpa.cell.neg.top.table <- vpa.cell.neg.data.for.sva %>%
# fit a linear model
lmFit(vpa.cell.neg.d.sv) %>%
# calculate the test statistics
eBayes() %>%
# select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.neg.data.for.sva))
# what the result looks like:
head(vpa.cell.neg.top.table)
logFC AveExpr t P.Value adj.P.Val
VPAnC119 -1.1323186 16.34932 -16.409667 3.271285e-13 5.855600e-11
VPAnC152 -1.3534317 16.93098 -11.248220 3.410586e-10 6.104950e-08
VPAnC114 0.9294965 15.08073 10.511594 1.116320e-09 1.998213e-07
VPAnC148 -0.7049114 14.87538 -8.903691 1.840005e-08 3.293608e-06
VPAnC18 -0.6384362 21.20349 -8.657914 2.903710e-08 5.197640e-06
VPAnC97 -0.3901582 19.32692 -8.136154 7.849318e-08 1.405028e-05
B
VPAnC119 20.447355
VPAnC152 13.521747
VPAnC114 12.324752
VPAnC148 9.486352
VPAnC18 9.023421
VPAnC97 8.013953
# make result more informative
vpa.cell.neg.top.w.info <- vpa.cell.neg.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
vpa_div_cntrl = 2 ^ logFC,
change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
) %>%
inner_join(vpa.cell.neg.compound.info, by = "compound_short") %>%
filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%
arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.neg.top.w.info %>%
select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
compound_short compound_full change_in_vpa
1 VPAnC33 4-Oxoproline down
2 VPAnC58 N-Acetylserine down
3 VPAnC65 Xanthine down
4 VPAnC97 myo-Inositol down
5 VPAnC41 Asparagine down
6 VPAnC23 Thiosulfate down
7 VPAnC46 Deoxyribose down
8 VPAnC135 Glycerol 1-phosphoserine down
9 VPAnC18 Glyceric Acid down
10 VPAnC86 G3P down
11 VPAnC136 D-Glucose 6-phosphate down
12 VPAnC148 Sedoheptulose 7-phosphate down
13 VPAnC172 Succinoadenosine down
14 VPAnC144 Xanthosine down
15 VPAnC119 Glycerylphosphorylethanolamine down
16 VPAnC10 Lactic Acid down
17 VPAnC48 Adenine down
18 VPAnC176 dTDP down
19 VPAnC127 Ribose 5-Phosphate down
20 VPAnC152 GlcNAc-1P down
21 VPAnC159 CMP down
22 VPAnC160 UMP down
23 VPAnC138 Inosine down
24 VPAnC168 AMP down
25 VPAnC169 GMP down
26 VPAnC114 Homocitric Acid up
27 VPAnC54 Valproic acid up
28 VPAnC53 O-Phosphoethanolamine up
29 VPAnC200 ADP-Glucose up
30 VPAnC157 11Z,14Z-Eicosadienoic Acid (20:2 n-6) up
31 VPAnC189 CTP up
32 VPAnC67 3-Sulfinoalanine up
33 VPAnC44 Aspartic Acid up
34 VPAnC73 2-Aminoadipic Acid up
35 VPAnC124 Cystathionine up
36 VPAnC17 Serine up
37 VPAnC30 Threonine up
38 VPAnC2 Glycine up
vpa_div_cntrl
1 0.8053737
2 0.7968679
3 0.7784193
4 0.7630459
5 0.7359651
6 0.7195058
7 0.7080025
8 0.6729909
9 0.6424089
10 0.6384422
11 0.6149253
12 0.6134802
13 0.5099818
14 0.4621206
15 0.4561820
16 0.4510010
17 0.4284833
18 0.4207822
19 0.4146314
20 0.3913600
21 0.3771595
22 0.3242992
23 0.2866219
24 0.1530079
25 0.1220719
26 1.9046112
27 1.8136293
28 1.5898490
29 1.5724338
30 1.5286555
31 1.4873520
32 1.4829863
33 1.4483811
34 1.4198508
35 1.4098145
36 1.2661473
37 1.2384215
38 1.2253333
#write_csv(path = "./results/vpa_cell_neg_top_hits_w_FC_pval.csv", vpa.cell.neg.top.w.info)
vpa.cell.neg.gathered <- vpa.cell.neg.noNA %>%
filter(Group == "sample") %>%
bind_cols(vpa.cell.neg.surr.var) %>%
select(Samples, Treatment, S1:S3, starts_with("VPAnC")) %>%
gather(key = "Compound", value = "Abundance", VPAnC1:VPAnC99)
# structure so far:
glimpse(vpa.cell.neg.gathered)
Observations: 4,117
Variables: 7
$ Samples <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ S1 <dbl> -0.02572670, 0.22693416, 0.24683594, 0.26304093, -0....
$ S2 <dbl> 0.30588286, -0.04248658, 0.01195802, 0.05597521, 0.1...
$ S3 <dbl> -0.2410150862, -0.1666861888, -0.1621577120, 0.10204...
$ Compound <chr> "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "VPAnC1", "V...
$ Abundance <dbl> 15.08795, 14.67335, 14.81717, 14.55139, 14.79476, 15...
vpa.cell.neg.nested <- vpa.cell.neg.gathered %>%
group_by(Compound) %>%
nest() %>%
# apply a linear model to each individual compound, as a function of the surrogate variables
mutate(model = map(data, ~lm(Abundance ~ S1 + S2 + S3, data = .))) %>%
# use broom to tidy up the output
mutate(augment_model = map(model, augment))
# result to far:
vpa.cell.neg.nested
# A tibble: 179 x 4
Compound data model augment_model
<chr> <list> <list> <list>
1 VPAnC1 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
2 VPAnC10 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
3 VPAnC100 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
4 VPAnC101 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
5 VPAnC102 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
6 VPAnC103 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
7 VPAnC104 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
8 VPAnC105 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
9 VPAnC106 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
10 VPAnC107 <tibble [23 x 6]> <S3: lm> <tibble [23 x 11]>
# ... with 169 more rows
# now to get the residuals out for each compound
vpa.cell.neg.modSV.resid <- vpa.cell.neg.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
# return to long format
spread(Compound, .resid)
glimpse(vpa.cell.neg.modSV.resid[, 1:5])
Observations: 23
Variables: 5
$ Samples <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_V1", "T1_...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "vpa", "vpa", "vpa", "cnt...
$ VPAnC1 <dbl> 0.10319936, -0.06371124, 0.04982032, -0.24595544, -0...
$ VPAnC10 <dbl> 0.29867785, 0.67292980, 0.19366552, -1.03548312, -0....
$ VPAnC100 <dbl> 0.098012517, -0.085761072, 0.123939301, 0.084645145,...
vpa.cell.neg.modSV.resid %>%
select(Samples, one_of(vpa.cell.neg.top.w.info$compound_short)) %>%
HeatmapPrepAlt("VPAnC") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Negative Mode",
margins = c(50, 50, 75, 30),
labRow = vpa.cell.neg.top.w.info$compound_full,
k_col = 2, k_row = 2
)
### PCA - cleaned data ###
vpa.cell.neg.modSV.pca <- vpa.cell.neg.modSV.resid %>%
select(starts_with("VPAnC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
vpa.cell.neg.modSV.pca.x <- as.data.frame(vpa.cell.neg.modSV.pca$x)
row.names(vpa.cell.neg.modSV.pca.x) <- vpa.cell.neg.modSV.resid$Samples
vpa.cell.neg.modSV.pca.x <- vpa.cell.neg.modSV.pca.x %>%
bind_cols(vpa.cell.neg.modSV.resid %>% select(Treatment))
vpa.cell.neg.modSV.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (60.5% Var)") +
ylab("PC2 (10.6% Var)")
vpa.cell.neg.modSV.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (7.6% Var)") +
ylab("PC4 (6.8% Var)")
vpa.cell.pos.smpl.data <- vpa.cell.pos.noNA %>%
filter(Group == "sample")
vpa.cell.pos.data.for.sva <- as.matrix(
vpa.cell.pos.smpl.data[, which(
colnames(vpa.cell.pos.smpl.data) == "VPApC1"
):ncol(vpa.cell.pos.smpl.data)]
)
row.names(vpa.cell.pos.data.for.sva) <- vpa.cell.pos.smpl.data$Samples
vpa.cell.pos.data.for.sva <- t(vpa.cell.pos.data.for.sva)
vpa.cell.pos.data.pheno <- as.data.frame(vpa.cell.pos.smpl.data[, 5:7])
row.names(vpa.cell.pos.data.pheno) <- vpa.cell.pos.smpl.data$Samples
# sva calculation
vpa.cell.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.cell.pos.data.pheno)
vpa.cell.pos.mod0 <- model.matrix(~ 1, data = vpa.cell.pos.data.pheno)
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "be")
[1] 2
set.seed(2018)
num.sv(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, method = "leek")
[1] 2
set.seed(2018)
vpa.cell.pos.sv <- sva(vpa.cell.pos.data.for.sva, vpa.cell.pos.mod.vpa, vpa.cell.pos.mod0)
Number of significant surrogate variables is: 2
Iteration (out of 5 ):1 2 3 4 5
vpa.cell.pos.surr.var <- as.data.frame(vpa.cell.pos.sv$sv)
colnames(vpa.cell.pos.surr.var) <- c("S1", "S2")
colnames(vpa.cell.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.cell.pos.d.sv <- cbind(vpa.cell.pos.mod.vpa, vpa.cell.pos.surr.var)
head(vpa.cell.pos.d.sv)
cntrl DRUGvsCNTRL S1 S2
T1_P1_C1 1 0 -0.3187310 0.19917816
T1_P1_C2 1 0 -0.2279608 -0.06387899
T1_P1_C3 1 0 -0.1409242 0.01021199
T1_P1_V1 1 1 -0.2796968 -0.36213773
T1_P1_V2 1 1 -0.1999223 -0.13213535
T1_P1_V3 1 1 -0.2955297 0.16841744
vpa.cell.pos.top.table <- vpa.cell.pos.data.for.sva %>%
lmFit(vpa.cell.pos.d.sv) %>%
eBayes() %>%
topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.cell.pos.data.for.sva))
vpa.cell.pos.top.w.info <- vpa.cell.pos.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
vpa_div_cntrl = 2 ^ logFC,
change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
) %>%
inner_join(vpa.cell.pos.compound.info, by = "compound_short") %>%
filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%
arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.cell.pos.top.w.info %>%
select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
compound_short compound_full change_in_vpa
1 VPApC35 Asparagine down
2 VPApC40 N-Methylnicotinate down
3 VPApC51 Xanthine down
4 VPApC101 Glycerol 1-phosphoserine down
5 VPApC24 Nicotinamide down
6 VPApC49 D-Ribose down
7 VPApC89 GlcNAc down
8 VPApC67 G3P down
9 VPApC112 Xanthosine down
10 VPApC38 Adenine down
11 VPApC39 Hypoxanthine down
12 VPApC119 CMP down
13 VPApC14 Uracil down
14 VPApC134 ADP down
15 VPApC107 Inosine down
16 VPApC85 Glycerylphosphorylethanolamine down
17 VPApC60 Fucose down
18 VPApC95 Uridine down
19 VPApC74 D-Galactitol down
20 VPApC120 UMP down
21 VPApC121 NMN down
22 VPApC100 Glycerol-3-phosphocholine down
23 VPApC17 Proline up
24 VPApC37 Aspartic Acid up
25 VPApC52 3-Sulfinoalanine up
26 VPApC172 PC(36:4) up
27 VPApC159 ADP-Glucose up
28 VPApC41 O-Phosphoethanolamine up
29 VPApC56 Methyl-L-Lysine up
30 VPApC90 Cystathionine up
31 VPApC105 Thiamine up
32 VPApC171 PC(35:2) up
vpa_div_cntrl
1 0.8020385
2 0.7628304
3 0.7304178
4 0.6439406
5 0.6390530
6 0.6253119
7 0.6142356
8 0.6080481
9 0.5521731
10 0.4588743
11 0.4538572
12 0.4491053
13 0.4414234
14 0.4391555
15 0.4284642
16 0.4242119
17 0.4078264
18 0.4012096
19 0.3819708
20 0.3793175
21 0.3466167
22 0.1517526
23 1.8933351
24 1.5849782
25 1.5334580
26 1.4855587
27 1.4155297
28 1.3495668
29 1.3212269
30 1.2885595
31 1.2460124
32 1.2131600
#write_csv(path = "./results/vpa_cell_pos_top_hits_w_FC_pval.csv", vpa.cell.pos.top.w.info)
vpa.cell.pos.gathered <- vpa.cell.pos.noNA %>%
filter(Group == "sample") %>%
bind_cols(vpa.cell.pos.surr.var) %>%
select(Samples, Treatment, S1:S2, starts_with("VPApC")) %>%
gather(key = "Compound", value = "Abundance", VPApC1:VPApC98)
vpa.cell.pos.nested <- vpa.cell.pos.gathered %>%
group_by(Compound) %>%
nest() %>%
mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>%
mutate(augment_model = map(model, augment))
vpa.cell.pos.modSV.resid <- vpa.cell.pos.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
spread(Compound, .resid)
vpa.cell.pos.modSV.resid %>%
select(Samples, one_of(vpa.cell.pos.top.w.info$compound_short)) %>%
HeatmapPrepAlt("VPApC") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nVPA-Only Experiment / Cells / Positive Mode",
margins = c(50, 50, 75, 30),
labRow = vpa.cell.pos.top.w.info$compound_full,
k_row = 2, k_col = 2
)
### PCA - cleaned data ###
vpa.cell.pos.modSV.pca <- vpa.cell.pos.modSV.resid %>%
select(starts_with("VPApC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
vpa.cell.pos.modSV.pca.x <- as.data.frame(vpa.cell.pos.modSV.pca$x)
row.names(vpa.cell.pos.modSV.pca.x) <- vpa.cell.pos.modSV.resid$Samples
vpa.cell.pos.modSV.pca.x <- vpa.cell.pos.modSV.pca.x %>%
bind_cols(vpa.cell.pos.modSV.resid %>% select(Treatment))
vpa.cell.pos.modSV.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (65.4% Var)") +
ylab("PC2 (12.4% Var)")
vpa.cell.pos.modSV.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (6.5% Var)") +
ylab("PC4 (4.5% Var)")
vpa.med.neg.smpl.data <- vpa.med.neg.noNA %>%
filter(Group == "sample")
vpa.med.neg.data.for.sva <- as.matrix(
vpa.med.neg.smpl.data[, which(
colnames(vpa.med.neg.smpl.data) == "VPAnM1"
):ncol(vpa.med.neg.smpl.data)]
)
row.names(vpa.med.neg.data.for.sva) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.data.for.sva <- t(vpa.med.neg.data.for.sva)
vpa.med.neg.data.pheno <- as.data.frame(vpa.med.neg.smpl.data[, 5:7])
row.names(vpa.med.neg.data.pheno) <- vpa.med.neg.smpl.data$Samples
vpa.med.neg.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.neg.data.pheno)
vpa.med.neg.mod0 <- model.matrix(~ 1, data = vpa.med.neg.data.pheno)
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.neg.sv <- sva(vpa.med.neg.data.for.sva, vpa.med.neg.mod.vpa, vpa.med.neg.mod0)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
vpa.med.neg.surr.var <- as.data.frame(vpa.med.neg.sv$sv)
colnames(vpa.med.neg.surr.var) <- c("S1")
colnames(vpa.med.neg.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.neg.d.sv <- cbind(vpa.med.neg.mod.vpa, vpa.med.neg.surr.var)
head(vpa.med.neg.d.sv)
cntrl DRUGvsCNTRL S1
T1_P1_C1 1 0 -0.11876454
T1_P1_C2 1 0 -0.16142883
T1_P1_C3 1 0 -0.36348674
T1_P1_V1 1 1 -0.07160605
T1_P1_V2 1 1 0.16394296
T1_P1_V3 1 1 0.23197510
vpa.med.neg.top.table <- vpa.med.neg.data.for.sva %>%
lmFit(vpa.med.neg.d.sv) %>%
eBayes() %>%
topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.neg.data.for.sva))
vpa.med.neg.top.w.info <- vpa.med.neg.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
vpa_div_cntrl = 2 ^ logFC,
change_in_vpa = ifelse(vpa_div_cntrl < 1, "down", "up")
) %>%
inner_join(vpa.med.neg.compound.info, by = "compound_short") %>%
filter(vpa_div_cntrl > 1.2 | vpa_div_cntrl < 0.83) %>%
arrange(change_in_vpa, desc(vpa_div_cntrl))
vpa.med.neg.top.w.info %>%
select(compound_short, compound_full, change_in_vpa, vpa_div_cntrl)
compound_short compound_full change_in_vpa vpa_div_cntrl
1 VPAnM49 Arachidonic Acid down 0.7855066
2 VPAnM24 Valproic acid up 32.7464129
vpa.med.neg.gathered <- vpa.med.neg.noNA %>%
filter(Group == "sample") %>%
bind_cols(vpa.med.neg.surr.var) %>%
select(Samples, Treatment, S1, starts_with("VPAnM")) %>%
gather(key = "Compound", value = "Abundance", VPAnM1:VPAnM9)
vpa.med.neg.nested <- vpa.med.neg.gathered %>%
group_by(Compound) %>%
nest() %>%
mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>%
mutate(augment_model = map(model, augment))
vpa.med.neg.modSV.resid <- vpa.med.neg.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
spread(Compound, .resid)
vpa.med.neg.modSV.resid %>%
select(Samples, one_of(vpa.med.neg.top.w.info$compound_short)) %>%
HeatmapPrepAlt("VPAnM") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nVPA-Only Experiment / Media / Negative Mode",
margins = c(50, 50, 75, 30),
labRow = vpa.med.neg.top.w.info$compound_full,
k_col = 2, k_row = 2
)
### PCA - cleaned data ###
vpa.med.neg.modSV.pca <- vpa.med.neg.modSV.resid %>%
select(starts_with("VPAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
vpa.med.neg.modSV.pca.x <- as.data.frame(vpa.med.neg.modSV.pca$x)
row.names(vpa.med.neg.modSV.pca.x) <- vpa.med.neg.modSV.resid$Samples
vpa.med.neg.modSV.pca.x <- vpa.med.neg.modSV.pca.x %>%
bind_cols(vpa.med.neg.modSV.resid %>% select(Treatment))
vpa.med.neg.modSV.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (67.9% Var)") +
ylab("PC2 (18.6% Var)")
vpa.med.neg.modSV.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (5.4% Var)") +
ylab("PC4 (3.4% Var)")
vpa.med.pos.smpl.data <- vpa.med.pos.noNA %>%
filter(Group == "sample")
vpa.med.pos.data.for.sva <- as.matrix(
vpa.med.pos.smpl.data[, which(
colnames(vpa.med.pos.smpl.data) == "VPApM1"
):ncol(vpa.med.pos.smpl.data)]
)
row.names(vpa.med.pos.data.for.sva) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.data.for.sva <- t(vpa.med.pos.data.for.sva)
vpa.med.pos.data.pheno <- as.data.frame(vpa.med.pos.smpl.data[, 5:7])
row.names(vpa.med.pos.data.pheno) <- vpa.med.pos.smpl.data$Samples
vpa.med.pos.mod.vpa <- model.matrix(~ as.factor(Treatment), data = vpa.med.pos.data.pheno)
vpa.med.pos.mod0 <- model.matrix(~ 1, data = vpa.med.pos.data.pheno)
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "be")
[1] 1
set.seed(2018)
num.sv(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, method = "leek")
[1] 0
set.seed(2018)
vpa.med.pos.sv <- sva(vpa.med.pos.data.for.sva, vpa.med.pos.mod.vpa, vpa.med.pos.mod0)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
vpa.med.pos.surr.var <- as.data.frame(vpa.med.pos.sv$sv)
colnames(vpa.med.pos.surr.var) <- c("S1")
colnames(vpa.med.pos.mod.vpa) <- c("cntrl", "DRUGvsCNTRL")
vpa.med.pos.d.sv <- cbind(vpa.med.pos.mod.vpa, vpa.med.pos.surr.var)
head(vpa.med.pos.d.sv)
cntrl DRUGvsCNTRL S1
T1_P1_C1 1 0 0.05493060
T1_P1_C2 1 0 0.19001851
T1_P1_C3 1 0 0.35675663
T1_P1_V1 1 1 0.02929437
T1_P1_V2 1 1 -0.20959541
T1_P1_V3 1 1 -0.24263772
vpa.med.pos.top.table <- vpa.med.pos.data.for.sva %>%
lmFit(vpa.med.pos.d.sv) %>%
eBayes() %>%
topTable(coef = "DRUGvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(vpa.med.pos.data.for.sva))
vpa.med.pos.top.table
data frame with 0 columns and 0 rows
head(vpa.cell.neg.top.w.info)
compound_short logFC AveExpr t P.Value adj.P.Val
1 VPAnC33 -0.3122697 23.94261 -4.541768 1.904961e-04 3.409880e-02
2 VPAnC58 -0.3275876 17.48468 -4.542368 1.902279e-04 3.405079e-02
3 VPAnC65 -0.3613807 20.56905 -4.542536 1.901525e-04 3.403730e-02
4 VPAnC97 -0.3901582 19.32692 -8.136154 7.849318e-08 1.405028e-05
5 VPAnC41 -0.4422907 18.18660 -7.076306 6.614607e-07 1.184015e-04
6 VPAnC23 -0.4749217 18.27583 -6.244306 3.915075e-06 7.007984e-04
B vpa_div_cntrl change_in_vpa compound_full formula mass
1 0.1367131 0.8053737 down 4-Oxoproline C5 H7 N O3 129.043
2 0.1381199 0.7968679 down N-Acetylserine C5 H9 N O4 147.054
3 0.1385153 0.7784193 down Xanthine C5 H4 N4 O2 152.034
4 8.0139530 0.7630459 down myo-Inositol C6 H12 O6 180.064
5 5.8501353 0.7359651 down Asparagine C4 H8 N2 O3 132.054
6 4.0475648 0.7195058 down Thiosulfate H2 O3 S2 113.945
rt cas_id
1 5.98 4347-18-6
2 3.38 16354-58-8
3 1.92 69-89-6
4 5.02 87-89-8
5 8.37 70-47-3
6 4.85 14383-50-7
head(vpa.cell.pos.top.w.info)
compound_short logFC AveExpr t P.Value adj.P.Val
1 VPApC35 -0.3182566 18.67222 -4.853699 7.687773e-05 0.010916637
2 VPApC40 -0.3905657 15.41180 -4.621143 1.350775e-04 0.019181004
3 VPApC51 -0.4532062 16.91997 -4.489349 1.860706e-04 0.026422023
4 VPApC101 -0.6350004 14.04687 -5.753222 8.991813e-06 0.001276837
5 VPApC24 -0.6459926 19.38535 -4.275739 3.128901e-04 0.044430397
6 VPApC49 -0.6773521 14.61980 -4.311187 2.870291e-04 0.040758128
B vpa_div_cntrl change_in_vpa compound_full
1 0.58407812 0.8020385 down Asparagine
2 0.01844396 0.7628304 down N-Methylnicotinate
3 -0.30211613 0.7304178 down Xanthine
4 2.75153845 0.6439406 down Glycerol 1-phosphoserine
5 -0.82081936 0.6390530 down Nicotinamide
6 -0.73485577 0.6253119 down D-Ribose
formula mass rt cas_id
1 C4 H8 N2 O3 132.054 8.39 70-47-3
2 C7 H7 N O2 137.048 10.39 535-83-1
3 C5 H4 N4 O2 152.034 1.92 69-89-6
4 C6 H14 N O8 P 259.045 8.00 26289-09-8
5 C6 H6 N2 O 122.048 1.90 98-92-0
6 C5 H10 O5 150.052 2.45 50-69-1
head(vpa.med.neg.top.w.info)
compound_short logFC AveExpr t P.Value adj.P.Val
1 VPAnM49 -0.3483048 14.19924 -4.089121 4.528439e-04 1.856660e-02
2 VPAnM24 5.0332650 18.25088 50.453771 5.235543e-25 2.146573e-23
B vpa_div_cntrl change_in_vpa compound_full formula
1 -2.342467 0.7855066 down Arachidonic Acid C20 H32 O2
2 47.332801 32.7464129 up Valproic acid C8 H16 O2
mass rt cas_id
1 304.241 1.17 506-32-1
2 144.115 1.30 99-66-1
vpa.full.hit.list <- vpa.cell.neg.top.w.info %>%
mutate(Mode = "cell.neg") %>%
bind_rows(
vpa.cell.pos.top.w.info %>%
mutate(Mode = "cell.pos")
) %>%
bind_rows(
vpa.med.neg.top.w.info %>%
mutate(Mode = "med.neg")
) %>%
as.tibble()
vpa.sml.hit.list <- vpa.full.hit.list %>%
select(compound_full:formula, cas_id) %>%
distinct() %>%
arrange(compound_full) %>%
left_join(cmpnd.id.db, by = "cas_id")
all.equal(vpa.sml.hit.list$compound_full.x, vpa.sml.hit.list$compound_full.y)
[1] TRUE
#write_csv(path = "./results/vpa_only_exp_hits_w_kegg.csv", vpa.sml.hit.list)
glimpse(vpa.sml.hit.list)
Observations: 56
Variables: 7
$ compound_full.x <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ formula <chr> "C20 H36 O2", "C6 H11 N O4", "C3 H7 N O4 S", "...
$ cas_id <chr> "2091-39-6", "542-32-5", "1115-65-7", "4347-18...
$ compound_full.y <chr> "11Z,14Z-Eicosadienoic Acid (20:2 n-6)", "2-Am...
$ HMDB <chr> "HMDB05060", "HMDB00510", "HMDB00996", "None",...
$ Metlin <chr> "62964", "3271", "5927", "63471", "85", "34522...
$ KEGG <chr> "C16525", "C00956", "C00606", "C01877", "C0014...
nucleotide.metab <- read_csv("./data/pathways/vpa_only_exp_nucleotide_hits.csv")
### Purine Metabolism ###
purine.triphosphates <- c("ATP", "GTP", "dATP", "dGTP")
purine.for.plot <- nucleotide.metab %>%
inner_join(vpa.full.hit.list, by = "cas_id") %>%
filter(path1 == "Purine" | path2 == "Purine" | path3 == "Purine") %>%
select(compound_full.x, compound_short) %>%
rename(compound_full = compound_full.x) %>%
bind_rows(
vpa.cell.neg.compound.info %>%
filter(compound_full %in% purine.triphosphates) %>%
select(compound_full, compound_short)
) %>%
bind_rows(
vpa.cell.pos.compound.info %>%
filter(compound_full %in% purine.triphosphates) %>%
select(compound_full, compound_short)
)
#write_csv(path = "./data/pathways/purine_pathway.csv", purine.for.plot) do not run
purine.plot.order <- read_csv("./data/pathways/purine_pathway.csv") %>%
mutate(plot_order = factor(Name, levels = Name))
purine.data <- vpa.cell.neg.modSV.resid %>%
inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>%
select(Samples:Treatment, one_of(purine.for.plot$compound_short)) %>%
mutate_at(vars(matches("VPA")), scale)
purine.sig.plot <- purine.data %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(
purine.plot.order %>%
filter(compound_full != "Aspartic Acid" & compound_full != "Ribose 5-Phosphate"),
by = "compound_short"
) %>%
filter(Sig == "Y") %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90")
) +
xlab("") +
ylab("") +
# ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
ylim(-2.5, 2.5)
purine.sig.plot
purine.not.sig.plot <- purine.data %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(purine.plot.order, by = "compound_short") %>%
filter(Sig == "N") %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90")
) +
xlab("") +
ylab("") +
#ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
ylim(-2.5, 2.5)
purine.not.sig.plot
### Pyrimidine Metabolism ###
pyrimidine.triphosphates <- c("UTP", "CTP", "dTTP", "dCTP")
pyrimidine.for.plot <- nucleotide.metab %>%
inner_join(vpa.full.hit.list, by = "cas_id") %>%
filter(path1 == "Pyrimidine" | path2 == "Pyrimidine" | path3 == "Pyrimidine") %>%
select(compound_full.x, compound_short) %>%
rename(compound_full = compound_full.x) %>%
bind_rows(
vpa.cell.neg.compound.info %>%
filter(compound_full %in% pyrimidine.triphosphates) %>%
select(compound_full, compound_short)
) %>%
bind_rows(
vpa.cell.pos.compound.info %>%
filter(compound_full %in% pyrimidine.triphosphates) %>%
select(compound_full, compound_short)
) %>%
distinct()
#write_csv(path = "./data/pathways/pyrimidine_pathway.csv", pyrimidine.for.plot)
pyrimidine.plot.order <- read_csv("./data/pathways/pyrimidine_pathway.csv") %>%
mutate(plot_order = factor(Name, levels = Name))
pyrimidine.data <- vpa.cell.neg.modSV.resid %>%
inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>%
select(Samples:Treatment, one_of(pyrimidine.for.plot$compound_short)) %>%
mutate_at(vars(matches("VPA")), scale)
pyrimidine.sig.plot <- pyrimidine.data %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(
pyrimidine.plot.order %>%
filter(compound_full != "Ribose 5-Phosphate"), by = "compound_short") %>%
filter(Sig == "Y") %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90")
) +
xlab("") +
ylab("") +
#ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nStatistically Significant Compounds") +
scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
ylim(-2.5, 2.5)
pyrimidine.sig.plot
pyrimidine.not.sig.plot <- pyrimidine.data %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(pyrimidine.plot.order, by = "compound_short") %>%
filter(Sig == "N") %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90")
) +
xlab("") +
ylab("") +
#ggtitle("Normalized Abundance by Treatment Group\nPyrimidine Metabolism\nNot Statistically Significant Compounds") +
scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
ylim(-2.5, 2.5)
pyrimidine.not.sig.plot
### Pentose Phosphate Pathway ###
ppp.for.plot <- nucleotide.metab %>%
inner_join(vpa.full.hit.list, by = "cas_id") %>%
filter(path1 == "PPP" | path2 == "PPP" | path3 == "PPP") %>%
select(compound_full.x, compound_short) %>%
rename(compound_full = compound_full.x)
#write_csv(path = "./data/pathways/ppp_pathway.csv", ppp.for.plot)
ppp.plot.order <- read_csv("./data/pathways/ppp_pathway.csv") %>%
mutate(plot_order = factor(Name, levels = Name))
ppp.data <- vpa.cell.neg.modSV.resid %>%
inner_join(vpa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>%
select(Samples:Treatment, one_of(ppp.for.plot$compound_short)) %>%
mutate_at(vars(matches("VPA")), scale)
ppp.sig.plot <- ppp.data %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(ppp.plot.order, by = "compound_short") %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90"),
legend.justification = "top"
) +
xlab("") +
ylab("") +
#ggtitle("Normalized Abundance by Treatment Group\nppp Metabolism\nStatistically Significant Compounds") +
scale_color_manual(values = c("#56B4E9", "#E69F00"), labels = c("Control", "VPA")) +
ylim(-2.75, 2.5) +
scale_x_discrete(labels = c("Glucose\n6-Phosphate", "Ribose", "Deoxyribose", "Glyceraldehyde\n3-Phosphate (Neg)", "Glyceraldehyde\n3-Phosphate (Pos)", "Ribose\n5-Phosphate", "Sedoheptulose\n7-Phosphate"))
ppp.sig.plot
### all together
nuc.legend <- get_legend(ppp.sig.plot)
vpa.only.nuc.sig.plot.grid <- plot_grid(
purine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(6,6,0,0), "pt")),
pyrimidine.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
plot_grid(
ppp.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
nuc.legend,
rel_widths = c(1.45, 0.55)
),
ncol = 1, labels = c("A", "B", "C"),
rel_heights = c(1, 1, 1)
)
vpa.only.nuc.sig.plot.grid
#save_plot("./results/vpa_only_exp_nuc_sig.png", vpa.only.nuc.sig.plot.grid, base_height = 10, base_width = 8)
nuc.not.sig.row.plots <- plot_grid(
purine.not.sig.plot,
pyrimidine.not.sig.plot,
labels = c("A", "B"),
ncol = 1
)
nuc.not.sig.row.plots
#save_plot("./results/vpa_only_exp_nuc_not_sig.png", nuc.not.sig.row.plots, base_width = 8, base_height = 8)
sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 ggridges_0.5.1 broom_0.5.0
[4] limma_3.36.5 sva_3.28.0 BiocParallel_1.14.2
[7] genefilter_1.62.0 mgcv_1.8-25 nlme_3.1-137
[10] heatmaply_0.15.2 viridis_0.5.1 viridisLite_0.3.0
[13] plotly_4.8.0 GGally_1.4.0 cowplot_0.9.3
[16] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
[19] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2
[22] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] colorspace_1.3-2 class_7.3-14 modeltools_0.2-22
[4] mclust_5.4.1 rprojroot_1.3-2 rstudioapi_0.8
[7] flexmix_2.3-14 bit64_0.9-7 fansi_0.4.0
[10] AnnotationDbi_1.42.1 mvtnorm_1.0-8 lubridate_1.7.4
[13] xml2_1.2.0 splines_3.5.1 codetools_0.2-15
[16] robustbase_0.93-3 knitr_1.20 jsonlite_1.5
[19] annotate_1.58.0 cluster_2.0.7-1 kernlab_0.9-27
[22] shiny_1.2.0 compiler_3.5.1 httr_1.3.1
[25] backports_1.1.2 assertthat_0.2.0 Matrix_1.2-15
[28] lazyeval_0.2.1 cli_1.0.1 later_0.7.5
[31] htmltools_0.3.6 tools_3.5.1 gtable_0.2.0
[34] glue_1.3.0 reshape2_1.4.3 Rcpp_1.0.0
[37] Biobase_2.40.0 cellranger_1.1.0 trimcluster_0.1-2.1
[40] gdata_2.18.0 crosstalk_1.0.0 iterators_1.0.10
[43] fpc_2.1-11.1 rvest_0.3.2 mime_0.6
[46] gtools_3.8.1 XML_3.98-1.16 dendextend_1.9.0
[49] DEoptimR_1.0-8 MASS_7.3-51.1 scales_1.0.0
[52] TSP_1.1-6 promises_1.0.1 hms_0.4.2
[55] parallel_3.5.1 RColorBrewer_1.1-2 yaml_2.2.0
[58] memoise_1.1.0 gridExtra_2.3 reshape_0.8.8
[61] stringi_1.2.4 RSQLite_2.1.1 gclus_1.3.1
[64] S4Vectors_0.18.3 foreach_1.4.4 seriation_1.2-3
[67] caTools_1.17.1.1 BiocGenerics_0.26.0 matrixStats_0.54.0
[70] rlang_0.3.0.1 pkgconfig_2.0.2 prabclus_2.2-6
[73] bitops_1.0-6 evaluate_0.12 lattice_0.20-38
[76] bindr_0.1.1 labeling_0.3 htmlwidgets_1.3
[79] bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
[82] magrittr_1.5 R6_2.3.0 IRanges_2.14.12
[85] gplots_3.0.1 DBI_1.0.0 pillar_1.3.0
[88] haven_1.1.2 whisker_0.3-2 withr_2.1.2
[91] survival_2.43-1 RCurl_1.95-4.11 nnet_7.3-12
[94] modelr_0.1.2 crayon_1.3.4 utf8_1.1.4
[97] KernSmooth_2.23-15 rmarkdown_1.10 grid_3.5.1
[100] readxl_1.1.0 data.table_1.11.8 blob_1.1.1
[103] digest_0.6.18 diptest_0.75-7 webshot_0.5.1
[106] xtable_1.8-3 httpuv_1.4.5 stats4_3.5.1
[109] munsell_0.5.0 registry_0.5